from BasicLibraries import *
import Functions.Miscellaneous.utils as ut
import allocation_ga as af
from scipy import spatial


# Module-level AllocationGAD instance; process_data_to_return_object uses it
# through AM.get_performances to extract the per-result performance frames.
AM = af.AllocationGAD()



# Empty string by default and not referenced in the code visible here.
# NOTE(review): presumably the OneDrive data-root prefix -- confirm before use.
ONEDRIVE = ''

def process_data_to_return_object(permutation_test_type, population_type, feature_type,
                                  tmp_outputs,
                                  final_year = 2021):
    """Collect the per-year optimisation outputs into the result objects.

    For each of the four years ending at ``final_year`` the function loads the
    pickled output ``tmp_outputs + str(year) + '.pckl'``, labels the optimal and
    observed vectors of every firm with the firm's ``mrg`` key, and stacks the
    company scores and the performances returned by ``AM.get_performances``.
    The stacked scores are then merged with the performances, re-scored and
    filtered (minimum number of years per firm, lower-tail trimming).

    Parameters
    ----------
    permutation_test_type
        Not used by this function; kept so existing callers keep working.
    population_type
        Path (or any input accepted by ``pd.read_csv``) of the population CSV.
    feature_type
        Not used by this function; kept so existing callers keep working.
    tmp_outputs
        Prefix of the yearly pickle files; the year and ``'.pckl'`` are
        appended to it.
    final_year : int
        Last year of the four-year window that is loaded.

    Returns
    -------
    tuple
        ``(dt, res, prf, optima, observed, feature_importance,
        feature_irrelevance)`` where ``dt`` is the filtered population frame,
        ``res`` the filtered score frame, ``prf`` the stacked performance
        frame, ``optima``/``observed`` the labelled vectors whose ``mrg`` is
        still present in ``res``, and the two dictionaries map ``str(year)``
        to the relevant / irrelevant features stored in that year's pickle.
    """
    #== Load the population data
    dt = pd.read_csv(population_type, low_memory=False)
    dt['rfyear'] = dt['rfyear'].astype(int).astype(str)
    dt = dt[dt.next_year_performance > -3]
    dt = dt.rename(columns={'GICS_level_1_x': 'GICS_level_1',
                            'GICS_level_2_x': 'GICS_level_2',
                            'GICS_level_3_x': 'GICS_level_3'})

    # Population columns attached to each year's company scores.
    population_columns = ['gvkey',
                          'next_year_non_financial_performance',
                          'next_year_financial_performance',
                          'next_year_performance',
                          'number_of_initiatives',
                          'GICS_level_1', 'GICS_level_2', 'GICS_level_3']

    #== Create the results objects
    # Yearly frames are gathered in lists and concatenated once after the
    # loop, instead of growing a DataFrame with repeated pd.concat calls.
    score_frames, performance_frames = [], []
    optimaTMP, observedTMP = [], []
    feature_importance = {}
    feature_irrelevance = {}
    max_year = final_year + 1
    min_year = max_year - 4

    for Y in range(min_year, max_year):
        # NOTE: unpickling can execute arbitrary code; only point tmp_outputs
        # at files produced by a trusted run.
        with open(tmp_outputs + str(Y) + '.pckl', 'rb') as handle:
            (resultsDICTIONARY, relevant_features, irrelevant_features,
             company_score, observed_, optima_,
             observed_dict, optima_dict, gvkey_list, reduced_dt) = pickle.load(handle)
        feature_importance[str(Y)] = relevant_features
        feature_irrelevance[str(Y)] = irrelevant_features

        #== Create the merge key used to label the vectors
        company_score['mrg'] = ut.utils().make_mrg(company_score, 'rfyear')

        #== Assign names to the vectors, preserving the order of optima_dict
        year_scores = company_score[company_score.rfyear == Y]
        for firm in optima_dict:
            # The first matching row provides the label; a firm without a row
            # for this year raises IndexError, as before.
            label = year_scores[year_scores.gvkey == firm].mrg.iloc[0]
            optimum, observation = optima_dict[firm], observed_dict[firm]
            optimum.name = label                                  ## Binary optimum
            observation.name = label                              ## Binary observed
            optimaTMP.append(optimum)
            observedTMP.append(observation)

        #== Performance dataframe
        population_year = dt[dt.rfyear == str(Y)]
        company_score = company_score.merge(population_year[population_columns], on='gvkey')
        score_frames.append(company_score)
        for i in range(len(resultsDICTIONARY)):
            tmp = AM.get_performances(resultsDICTIONARY, i)
            # Integer results are skipped -- presumably a "no result" sentinel
            # of get_performances; verify against its implementation.
            if not isinstance(tmp, int):
                tmp['rfyear'] = Y
                performance_frames.append(tmp)

    res = pd.concat(score_frames)
    # Fall back to an empty frame so that a run without performances fails on
    # the column access below, exactly as it did before.
    prf = pd.concat(performance_frames) if performance_frames else pd.DataFrame()

    #== Performance dataframe
    prf = prf.reset_index(drop=True)
    # NOTE(review): with drop=True no 'index' column is created by the line
    # above, so this rename only matters if the frames already carry one.
    prf = prf.rename(columns={'index': 'gvkey'})
    prf['Scarto'] = prf.OptimalPerformance - prf.ExpectedPerformance
    prf['mrg'] = prf['gvkey'].astype(int).astype(str) + '-' + prf['rfyear'].astype(int).astype(str)
    res = res.merge(prf[['ExpectedPerformance', 'OptimalPerformance', 'mrg']], on='mrg')

    #== Better score factor
    res = res.drop(columns=['score'])
    performance_gap = res['OptimalPerformance'].abs() - res['ExpectedPerformance'].abs()
    res['score'] = (1. / (100 * performance_gap)) / (1 + res['distance_from_opt'])

    #== Adjustment to the results dataframe
    # Only positive infinity is mapped to NaN here.
    res = res.replace([np.inf], [np.nan])

    #== Choose the minimum number of years to be included in the sample
    minimum_number_of_years = 2
    years_per_firm = res.groupby('gvkey')['rfyear'].count()
    res = res[res.gvkey.isin(years_per_firm[years_per_firm >= minimum_number_of_years].index)]

    res = res.reset_index(drop=True)
    res['size_q'] = pd.qcut(res['firm_size'], 4, labels=False)
    # Trim the lower tails of the score and of next year's performance.
    res = res[res.score > res.score.quantile(0.01)]
    res = res[res.next_year_performance > res.next_year_performance.quantile(0.005)]

    #== Only keep the behaviours that are still in the results dataframe
    # The keys are collected once in a set so each membership test is O(1).
    retained_keys = set(res.mrg.values.tolist())
    optima, observed = [], []
    for optimum, observation in zip(optimaTMP, observedTMP):
        if optimum.name in retained_keys:
            optima.append(optimum)
            observed.append(observation)
    return dt, res, prf, optima, observed, feature_importance, feature_irrelevance
